Required Packages and functions
library(here)
library(tidyverse)
library(minfi)
library(ggplot2)
library(stringr)
library(limma)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
# or for EPIC
# library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0)
library(minfi)
library(DMRcate)
library(UpSetR)
# specify correct array when loading Locations
IlluminaHumanMethylation450kanno.ilmn12.hg19:data(Locations)
manhattan = function(probe, locations, colors = c("#2D728F", "#2B4162")){
probe = cbind(probe, locations[,'chr'][match(rownames(probe), rownames(locations))])
probe = cbind(probe, locations[,'pos'][match(rownames(probe), rownames(locations))])
colnames(probe)[c(ncol(probe) - 1, ncol(probe))] = c('chr', 'pos')
probe$chr = as.numeric(gsub("chr", "", probe$chr))
don = probe %>%
# Compute chromosome size
group_by(chr) %>% summarise(chr_len=as.numeric(max(pos))) %>%
# Calculate cumulative position of each chromosome
mutate(tot=cumsum(chr_len)-chr_len) %>% dplyr::select(-chr_len) %>%
# Add this info to the initial dataset
left_join(probe, ., by=c("chr"="chr")) %>%
# Add a cumulative position of each site
arrange(chr, pos) %>% mutate(poscum=pos+tot) # %>%
# Add highlight and annotation information
# mutate(is_highlight=ifelse(SNP %in% snpsOfInterest, "yes", "no")) %>%
# mutate(is_annotate_fdr=ifelse(adj.P.Val<0.05 & adj.P.Val.bonf>0.05, "yes", "no")) %>%
# mutate(is_annotate_bonf=ifelse(adj.P.Val.bonf <0.05, "yes", "no"))
# don$alpha[don$is_annotate_fdr == 'no'] = 0.6
# don$alpha[don$is_annotate_fdr == 'yes'] = 0
# Prepare X axis
axisdf = don %>% group_by(chr) %>% summarize(center=(max(poscum) + min(poscum))/2)
don = merge(don, probe[,c(1,11)], on='cpg', all.x=T)
manhattan = ggplot(don, aes(x=poscum, y=-log10(P.Value))) +
geom_point(aes(color=as.factor(chr)), size=0.8, alpha = 0.55) + scale_color_manual(values = rep(colors, dim(table(don$chr)))) +
# p-value cutoffs
geom_hline(yintercept=-log10(0.05/nrow(don)), colour = '#AB3428', size=.5) +
# geom_hline(yintercept=-log10(max(don$P.Value[don$adj.P.Val < 0.05])), colour='#AB3428', size=.4) +
# custom X axis:
scale_x_continuous(expand = c(0, 0), limits = c(min(don$poscum), max(don$poscum)), label = axisdf$chr, breaks= axisdf$center) +
scale_y_continuous(expand = c(0, 0)) +
# Custom the theme:
theme_minimal() + theme(
legend.position="none", panel.border = element_blank(), panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_line(size = 0.2, color = 'gray65')) + theme(axis.title.y = element_blank()) + theme(text = element_text(size = 7.5)) +
labs(y=expression(-log[10]*"(P-value)"), x='Chromosome')
return(manhattan)
}
lambda <- function(p) median(qchisq(p, df=1, lower.tail=FALSE), na.rm=TRUE) / qchisq(0.5, df=1)
gg_qqplot = function(pvector) {
l = round(lambda(pvector), 3)
o = -log10(sort(pvector, decreasing = FALSE))
e = -log10(ppoints(length(pvector)))
df = data.frame(o = o, e = e)
ggplot(df, aes(e, o)) + geom_point(alpha = 0.5, size = 1) + geom_abline(intercept = 0, slope = 1, color = '#AB3428') + labs(y = expression(Observed ~ ~-log[10](italic(p))), x = expression(Expected ~ ~-log[10](italic(p)))) + theme_classic() + annotate("text", x = 1, y = 5, label = paste0('lambda = ', l))
}
volcano = function(probe) {
volcano = ggplot(probe, aes(x=logFC, y = -log10(P.Value))) +
geom_point(size = 0.8, alpha=0.4) +
geom_hline(aes(yintercept = -log10(0.05/nrow(probe)),linetype = 'Bonferroni threshold'), color = "#AB3428", size = 0.5) +
# geom_hline(aes(yintercept = -log10(max(P.Value[adj.P.Val < 0.05])),linetype = 'FDR threshold'), color = "#AB3428", size = 0.3) +
scale_linetype_manual(name = '', values = c(1,2), guide = guide_legend(override.aes = list(color = c("#AB3428", "#AB3428")))) + theme_minimal() +
labs(y=expression(-log[10]*"(P-value)"), x='Effect estimate') + theme(panel.grid.minor.y = element_blank()) + theme(text = element_text(size=8)) +
theme(legend.position="none") + scale_y_continuous(expand = c(0, 0)) + theme(panel.grid.minor.x = element_blank(), panel.grid.major.y = element_line(size = 0.2, color = 'gray65'), panel.grid.major.x = element_line(size = 0.2, color = 'gray65'))
return(volcano)
}
Load Data
# setwd("/Users/annebozack/Documents/Navas/S2_121611_run 2")
# Load Data
load("fact_funnorm_combat_data.RData")
dim(Mcombat)
# 411400 48
# Additional probe filtering for cross-reactive probes and probes near SNPs using DMRcate
# Setting minimum minor allele frequency to 0.05 and maximum distance from probe to 10
McombatFilter = rmSNPandCH(Mcombat, dist = 2, mafcut = 0.05, and = TRUE, rmcrosshyb = TRUE, rmXY=FALSE)
dim(McombatFilder)
# 408765 48
# Missigness?
na_count <-sapply(pheno, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count
# LandOwn 1, FolDef 1
# Look at exposure distribution
ggplot(pheno, aes(uAsCr)) + geom_histogram(alpha = 0.7, binwidth = 50) + theme_minimal()
quartz.save('EWAS plots/uAsCr_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(log(uAsCr))) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/uAsCr_log_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(log(uAsCr), fill = expGroup)) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/uAsCr_log_groups_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(wAs, fill = expGroup)) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/wAs_groups_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(bAs)) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/bAs_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(log(bAs))) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/bAs_log_Hist.png', type = "png", dpi = 300)
ggplot(pheno, aes(log(bAs), fill = expGroup)) + geom_histogram(alpha = 0.7) + theme_minimal()
quartz.save('EWAS plots/bAs_log_groups_Hist.png', type = "png", dpi = 300)
uAsCr distribution
knitr::include_graphics('EWAS plots/uAsCrHist.png')

log-uAsCr distribution
knitr::include_graphics('EWAS plots/uAsCr_log_Hist.png')

log-AsCr distribution by exposure groups
knitr::include_graphics('EWAS plots/uAsCr_log_groups_Hist.png')

Water As distribution by exposure groups
knitr::include_graphics('EWAS plots/wAs_groups_Hist.png')

bAs distribution
knitr::include_graphics('EWAS plots/bAs_Hist.png')

log-bAs distribution
knitr::include_graphics('EWAS plots/bAs_log_Hist.png')

log-bAs distribution by exposure groups
knitr::include_graphics('EWAS plots/bAs_log_groups_Hist.png')

# Check Aligment
all(pheno$Sample_Name==colnames(Mcombat))
all(identical(pheno$Sample_Name,colnames(Mcombat)))
# TRUE
Correlations with celltype for exposure
require(corrplot)
col <- colorRampPalette(c("red","orangered1","aliceblue", "deepskyblue1", "blue1"))
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j],method="spearman")
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-value of the correlation
M<-as.matrix(as.data.frame(pheno[,c("CD8T","CD4T","NK","Bcell","Mono","Gran","uAsCr")]))
colnames(M)<-c("CD8T","CD4T","NK","Bcell","Mono","Gran","AsCr")
p.mat <- cor.mtest(M)
## Correlogram
M <- cor(M,method="spearman")
# Correlogram
corrplot(M, method="color", col=col(1000),
type="lower", #order="hclust",
cl.lim=c(-1,1),
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, # Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = 0.05, insig = "blank",
addgrid.col="white",
# hide correlation coefficient on the principal diagonal
diag=T,
#addrect = 1,
tl.cex = 1/par("cex"),cl.cex = 1/par("cex")
)
mtext(expression("Spearman correlations"~(italic("r")["s"])),side=1,col="black",cex=0.9)
quartz.save('EWAS plots/cell_As_correlations.png', type = "png", dpi = 300)
knitr::include_graphics('EWAS plots/cell_As_correlations.png')

DMPs with limma
Continuous log-uAsCr
# Unadjusted
modUnadj = model.matrix(~log(pheno$uAsCr))
# Adjusted for cell type
modCell = model.matrix(~log(pheno$uAsCr) + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran)
# Adjusted for cell type, age, smoking, and betelnut use
modCellAgeSmBet = model.matrix(~log(pheno$uAsCr) + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran + pheno$age + pheno$Cig.Ever + pheno$Betel)
# Unadjusted
fit = lmFit(McombatFilter, modUnadj)
fit = eBayes(fit)
probeUnadj = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeUnadj$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeUnadj$P.Value < 0.05)
# FALSE TRUE
# 377507 31258
table(probeUnadj$adj.P.Val < 0.05)
# FALSE
# 408765
write.csv(probeUnadj, 'EWAS results/uAsCr_unadj.csv')
# Adjusted for cell type
fit = lmFit(McombatFilter, modCell)
fit = eBayes(fit)
probeCell = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeCell$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeCell$P.Value < 0.05)
# FALSE TRUE
# 384611 24154
table(probeCell$adj.P.Val < 0.05)
# FALSE
# 409409
write.csv(probeCell, 'EWAS results/uAsCr_adjCell.csv')
gg_qqplot(probeCell$P.Value)
quartz.save('EWAS plots/qq_log_uAsCr_adjCell.png', type = "png", dpi = 300)
volcano(probeCell)
quartz.save('EWAS plots/vol_log_uAsCr_adjCell.png', type = "png", dpi = 300)
manhattan(probeCell, Locations)
quartz.save('EWAS plots/man_log_uAsCr_adjCell.png', type = "png", dpi = 300)
# Adjusted for cell type, age, smoking, and betelnut use
fit = lmFit(McombatFilter, modCellAgeSmBet)
fit = eBayes(fit)
probeCellAgeSmBet = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeCellAgeSmBet$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeCellAgeSmBet$P.Value < 0.05)
# FALSE TRUE
# 385171 23594
table(probeCellAgeSmBet$adj.P.Val < 0.05)
# FALSE
# 408765
QQ plot, DMPs, continuous log-uAsCr adjusted for cell type
knitr::include_graphics('EWAS plots/qq_log_uAsCr_adjCell.png')

Volcano plot, DMPs, continuous log-uAsCr adjusted for cell type
knitr::include_graphics('EWAS plots/vol_log_uAsCr_adjCell.png')

Manhattan plot, DMPs, continuous log-uAsCr adjusted for cell type
knitr::include_graphics('EWAS plots/man_log_uAsCr_adjCell.png')

Categorical As
# Unadjusted
modUnadjCat = model.matrix(~pheno$expGroup)
# Adjusted for cell type
modCellCat = model.matrix(~pheno$expGroup + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran)
# Adjusted for cell type, age, smoking, and betelnut use
modCellAgeSmBetCat = model.matrix(~pheno$expGroup + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran + pheno$age + pheno$Cig.Ever + pheno$Betel)
# Unadjusted
fit = lmFit(McombatFilter, modUnadjCat)
fit = eBayes(fit)
probeUnadjCat = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeUnadjCat$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeUnadjCat$P.Value < 0.05)
# FALSE TRUE
# 3358266 50499
table(probeUnadjCat$adj.P.Val < 0.05)
# FALSE
# 408765
write.csv(probeUnadjCat, 'EWAS results/AsCategorical_unadj.csv')
# Adjusted for cell type
fit = lmFit(McombatFilter, modCellCat)
fit = eBayes(fit)
probeCellCat = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeCellCat$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeCellCat$P.Value < 0.05)
# FALSE TRUE
# 389554 19211
table(probeCellCat$adj.P.Val < 0.05)
# FALSE
# 408765
write.csv(probeUnadjCat, 'EWAS results/AsCategorical_adjCell.csv')
gg_qqplot(probeCellCat $P.Value)
quartz.save('EWAS plots/qq_catAs_adjCell.png', type = "png", dpi = 300)
volcano(probeCellCat)
quartz.save('EWAS plots/vol_catAs_adjCell.png', type = "png", dpi = 300)
manhattan(probeCellCat, Locations)
quartz.save('EWAS plots/man_catAs_adjCell.png', type = "png", dpi = 300)
# Adjusted for cell type, age, smoking, and betelnut use
fit = lmFit(McombatFilter, modCellAgeSmBetCat)
fit = eBayes(fit)
probeCellAgeSmBetCat = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeCellAgeSmBetCat$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeCellAgeSmBetCat$P.Value < 0.05)
# FALSE TRUE
# 390622 18143
table(probeCellAgeSmBetCat$adj.P.Val < 0.05)
# FALSE
# 408765
QQ plot, DMPs, categorized As adjusted for cell type
knitr::include_graphics('EWAS plots/qq_catAs_adjCell.png')

Volcano plot, DMPs, categorized As adjusted for cell type
knitr::include_graphics('EWAS plots/vol_catAs_adjCell.png')

Manhattan plot, DMPs, categorized As adjusted for cell type
knitr::include_graphics('EWAS plots/man_catAs_adjCell.png')

Continuous log-bAs
# Unadjusted
modUnadj = model.matrix(~log(pheno$bAs))
# Adjusted for cell type
modCell = model.matrix(~log(pheno$bAs) + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran)
# Adjusted for cell type, age, smoking, and betelnut use
modCellAgeSmBet = model.matrix(~log(pheno$bAs) + pheno$CD8T + pheno$CD4T + pheno$NK + pheno$Bcell + pheno$Mono + pheno$Gran + pheno$age + pheno$Cig.Ever + pheno$Betel)
# Unadjusted
fit = lmFit(McombatFilter, modUnadj)
fit = eBayes(fit)
probeBAsUnadj = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeBAsUnadj$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeBAsUnadj$P.Value < 0.05)
# FALSE TRUE
# 391654 17111
table(probeBAsUnadj$adj.P.Val < 0.05)
# FALSE
# 408765
# Adjusted for cell type
fit = lmFit(McombatFilter, modCell)
fit = eBayes(fit)
probeBAsCell = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeBAsCell$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeBAsCell$P.Value < 0.05)
# FALSE TRUE
# 3390079 18686
table(probeBAsCell$adj.P.Val < 0.05)
# FALSE
# 408765
gg_qqplot(probeBAsCell$P.Value)
quartz.save('EWAS plots/qq_log_bAs_adjCell.png', type = "png", dpi = 300)
volcano(probeCell)
quartz.save('EWAS plots/vol_log_bAs_adjCell.png', type = "png", dpi = 300)
manhattan(probeCell, Locations)
quartz.save('EWAS plots/man_log_bAs_adjCell.png', type = "png", dpi = 300)
# Adjusted for cell type, age, smoking, and betelnut use
fit = lmFit(McombatFilter, modCellAgeSmBet)
fit = eBayes(fit)
probeBAsCellAgeSmBet = topTable(fit, adjust = 'BH', coef = 2, number = Inf, confint = T)
probeBAsCellAgeSmBet$adj.P.Val.bonf = topTable(fit,adjust="bonferroni",coef=2, number = Inf)$adj.P.Val
table(probeBAsCellAgeSmBet$P.Value < 0.05)
# FALSE TRUE
# 390776 17989
table(probeBAsCellAgeSmBet$adj.P.Val < 0.05)
# FALSE
# 408765
QQ plot, DMPs, continuous log-bAs adjusted for cell type
knitr::include_graphics('EWAS plots/qq_log_bAs_adjCell.png')

Volcano plot, DMPs, continuous log-bAs adjusted for cell type
knitr::include_graphics('EWAS plots/vol_log_bAs_adjCell.png')

Manhattan plot, DMPs, continuous log-bAs adjusted for cell type
knitr::include_graphics('EWAS plots/man_log_bAs_adjCell.png')

Overlap between exposure variables
listInput = list(catAsUnadj = rownames(probeUnadjCat)[probeUnadjCat$P.Value < 0.05], catAsAdj = rownames(probeCellCat)[probeCellCat$P.Value < 0.05], uAsCrUnadj = rownames(probeUnadj)[probeUnadj$P.Value < 0.05], uAsCrAdj = rownames(probeCell)[probeCell$P.Value < 0.05], bAsUnadj = rownames(probeBAsUnadj)[probeBAsUnadj$P.Value < 0.05], bAsAdj = rownames(probeBAsCell)[probeBAsCell$P.Value < 0.05])
upset(fromList(listInput), order.by = "freq", sets = c('catAsUnadj', 'catAsAdj', 'uAsCrUnadj', 'uAsCrAdj', 'bAsUnadj', 'bAsAdj'), keep.order= T, number.angles = 30, point.size = 3.5)
quartz.save('EWAS plots/upset plot.png', type = "png", dpi = 300)
Upset plot, CpGs at p < 0.05, with and without adjusting for cell type
knitr::include_graphics('EWAS plots/upset plot.png')
